import sys

sys.path.append('../utils/')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # Needs the package Pandas to be installed. Check Anaconda Environments and Packages.
import seaborn as sns  # BUG FIX: `sns.heatmap` is called below but seaborn was never imported.
from matplotlib.ticker import NullFormatter
from mpl_toolkits import mplot3d
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA  # Needs SciKit Learn package to be installed. Check Anaconda Environments and Packages.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from ImageUtils import *
# --- Load the image data --------------------------------------------------
# Grayscale faces94 images plus landscape photos (helpers from ImageUtils).
faces94_male = readFaces94MaleFaces(gray=True)
faces94_female = readFaces94FemaleFaces(gray=True)
faces94_malestaff = readFaces94MaleStaffFaces(gray=True)
landscapes = np.array(readLandsCapeImage(gray=True))

# One (N, height, width) stack of all images.
dataset = np.vstack((faces94_male, faces94_female, faces94_malestaff, landscapes))

# Class labels: 0 = landscape, 1 = male (incl. male staff), 2 = female.
labels = np.concatenate((
    np.ones(faces94_male.shape[0]),
    np.full(faces94_female.shape[0], 2),
    # np.full(faces94_malestaff.shape[0], 3),
    np.ones(faces94_malestaff.shape[0]),
    np.zeros(landscapes.shape[0])
))

dataset_N, height, width = dataset.shape

# Scale pixel intensities into [0, 1].
dataset_norm = dataset / 255

# Per-pixel mean image of the normalized data.
mean = np.mean(dataset_norm.reshape(dataset_N, height * width), axis=0).reshape(height, width)

# NOTE: np.cov treats each ROW as one variable, so this is the (N x N)
# "snapshot" covariance between images, not the (D x D) pixel covariance.
dataset_norm_cov = np.cov(dataset_norm.reshape(dataset_N, height * width))
dataset_norm_cov.shape
# Singular values of the snapshot covariance (proportional to the
# variance captured by each principal direction).
_, s, _ = np.linalg.svd(dataset_norm_cov)

representation_percentage = 0.85  # Selected variability


def components_for_variance(singular_values, threshold):
    """Return the smallest count of leading components whose cumulative
    variance ratio reaches `threshold`.

    Falls back to ALL components when the threshold is never reached
    (the original accumulator loop silently returned 0 in that case).
    """
    ratios = np.asarray(singular_values, dtype=float)
    ratios = ratios / ratios.sum()
    cumulative = np.cumsum(ratios)
    reached = np.nonzero(cumulative >= threshold)[0]
    # Index j means components 0..j are needed, i.e. j + 1 of them.
    return int(reached[0]) + 1 if reached.size else ratios.size


sum_eig = np.sum(s)
percentage_variance = np.divide(s, sum_eig)
num_var = components_for_variance(s, representation_percentage)
num_var
# Cumulative explained-variance curve; report the first component whose
# marginal contribution drops below 0.01%.
cum_per = np.cumsum(percentage_variance)
for idx in range(1, len(s)):
    gain = (cum_per[idx] - cum_per[idx - 1]) / cum_per[idx - 1] * 100
    if gain < .01:
        num_var1 = idx - 1
        print("First", num_var1, "components with ", cum_per[num_var1] * 100,
              "percent of variability captured and from which the contribution is less than 0.01%")
        break

plt.figure(figsize=(12, 6))
plt.plot(cum_per * 100)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')  # for each component
plt.title('Cumulative Summation of the Explained Variance')
plt.show()
# Fit PCA keeping num_var components.
# NOTE(review): PCA is fitted on the RAW pixel values, while the eigenvalue
# analysis above used values scaled by 255 — scaling does not change the
# variance ratios, but confirm this is intentional.
pca = PCA(n_components=num_var, svd_solver='full').fit(dataset.reshape(dataset_N, height * width))
pca.components_.shape

# Display the first 16 eigenfaces on a 4x4 grid.
cols = 4
rows = 4
plt.figure(figsize=(30, 20))
for panel in np.arange(rows * cols):
    plt.subplot(rows, cols, panel + 1)
    plt.imshow(pca.components_[panel].reshape(height, width), plt.cm.gray)
# --- Mean image per class (male incl. staff, female, landscapes) ----------
dataset_male = np.vstack((faces94_male, faces94_malestaff))
dataset_male.shape


def _class_mean(images):
    # Per-pixel mean of the normalized images, back in (height, width) shape.
    flat = images.reshape(images.shape[0], height * width) / 255
    return np.mean(flat, axis=0).reshape(height, width)


mean_male = _class_mean(dataset_male)
mean_female = _class_mean(faces94_female)
mean_landscape = _class_mean(landscapes)

# Show the three mean images side by side.
fig = plt.figure(figsize=(10, 6))
panels = [("Mean Male", mean_male), ("Mean Female", mean_female), ("Mean Landscapes", mean_landscape)]
for pos, (title, img) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 3, pos)
    plt.title(title)
    ax.imshow(img * 255, plt.cm.gray)
# Per-class "snapshot" covariance matrices: np.cov over image rows, so each
# result is (n_images x n_images), not a pixel-by-pixel covariance.
male_cov = np.cov(np.subtract(dataset_male/255, mean_male).reshape(dataset_male.shape[0], height*width))
male_cov.shape
female_cov = np.cov(np.subtract(faces94_female/255, mean_female).reshape(faces94_female.shape[0], height*width))
female_cov.shape
landscape_cov = np.cov(np.subtract(landscapes/255, mean_landscape).reshape(landscapes.shape[0], height*width))
landscape_cov.shape
# NOTE(review): the three *_base_matrix arrays below are never used anywhere
# in this file — candidates for removal (kept in case other code relies on them).
landscape_base_matrix = np.ones((landscapes.shape[0], height*width))
male_base_matrix = np.ones((dataset_male.shape[0], height*width))
female_base_matrix = np.ones((faces94_female.shape[0], height*width))
# Project every image onto the PCA eigenfaces (N x num_var coordinates).
dataset_projected = pca.transform(dataset.reshape(dataset_N, height*width))
dataset_projected.shape
pca.explained_variance_ratio_
# --- K-means on the PCA projection ----------------------------------------
kmeans = KMeans(n_clusters=3, random_state=42).fit(dataset_projected)
wcentroids = kmeans.cluster_centers_
wcentroids.shape

# Reconstruct each centroid back into pixel space and display it.
cols, rows = 3, 1
plt.figure(figsize=(10, 6))
for k in np.arange(rows * cols):
    plt.subplot(rows, cols, k + 1)
    plt.title("Class " + str(k + 1))
    reconstruction = np.dot(kmeans.cluster_centers_[k], pca.components_) + mean.reshape(height * width)
    plt.imshow(reconstruction.reshape(height, width), plt.cm.gray)

y_label = kmeans.labels_
# Distance from each sample to every centroid; keep the nearest one.
wtotaldist = kmeans.transform(dataset_projected)
wdistances = np.amin(wtotaldist, axis=1)
print(wdistances.shape[0])
def show_cluster_distances(kclass, class_name):
    """Summarize one k-means cluster: distance histogram, then the images
    closest to its centroid (3x4 grid) and farthest from it (1x4 row).

    This replaces three copy-pasted, near-identical sections (one per
    cluster) with a single parameterized helper.

    kclass     -- k-means cluster id (0-based)
    class_name -- label used in plot titles, e.g. "Class1"
    """
    print("Number images: " + str(wdistances[y_label == kclass].shape[0]))
    histbox(wdistances[y_label == kclass])

    # Sort all samples by distance (ascending), then keep this cluster only.
    frame = pd.DataFrame({'w distances': wdistances, 'label': y_label})
    frame.sort_values('w distances', axis=0, ascending=True, inplace=True, na_position='first')
    cluster = frame.loc[frame[frame.columns[1]] == kclass]
    cluster.head(6)

    # Low-distance images: the most typical members of the cluster.
    cols, rows = 4, 3
    plt.figure(figsize=(24, 16))
    for i in np.arange(rows * cols):
        plt.subplot(rows, cols, i + 1)
        plt.title(class_name + " low distance " + str(cluster['w distances'][cluster.index[i]]), fontsize=13)
        plt.imshow(dataset[cluster.index[i]], plt.cm.gray)

    # High-distance images: outliers within the cluster.
    cols, rows = 4, 1
    plt.figure(figsize=(15, 10))
    for i in np.arange(rows * cols):
        plt.subplot(rows, cols, i + 1)
        plt.title(class_name + " high distance " + str(cluster['w distances'][cluster.index[-(i + 1)]]), fontsize=10)
        plt.imshow(dataset[cluster.index[-(i + 1)]], plt.cm.gray)


show_cluster_distances(0, "Class1")
show_cluster_distances(1, "Class2")
show_cluster_distances(2, "Class3")
# Ground-truth labels re-coded to compare against the k-means clusters:
# 0 = landscape, 1 = female, 2 = male (incl. staff).
labelsk = np.concatenate((
np.full(faces94_male.shape[0],2),
np.ones(faces94_female.shape[0]),
np.full(faces94_malestaff.shape[0],2),
np.zeros(landscapes.shape[0])
))
# NOTE(review): k-means cluster ids are arbitrary; the scores below assume
# clusters 0/1/2 happen to align with landscape/female/male. Verify the
# alignment (e.g. via the heatmap) before trusting accuracy/precision.
cm=confusion_matrix(labelsk, y_label).ravel()
plt.figure()
plt.title("Heatmap")
prediction_data = {'y_Actual': labelsk,'y_Predicted': y_label}
df = pd.DataFrame(prediction_data, columns=['y_Actual','y_Predicted'])
confusionmatrix1 = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
# Requires seaborn to be imported as `sns` at the top of the file.
ax=sns.heatmap(confusionmatrix1, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['landscape', 'female', 'male']); ax.yaxis.set_ticklabels(['landscape', 'female', 'male']);
ax.invert_yaxis()
accuracy_score(y_true=labelsk, y_pred=y_label)
precision_score(y_true=labelsk, y_pred=y_label, average=None)
print(classification_report(y_true=labelsk, y_pred=y_label, target_names=["landscape", "woman", "man"]))
# 3-D scatter of per-centroid distances, one series per assigned cluster.
Y = kmeans.transform(dataset_projected)
plt.figure(figsize=(10, 8))
ax = plt.axes(projection='3d')
for cluster_id in (0, 1, 2):
    members = np.where(y_label == cluster_id)
    ax.scatter(Y[members, 0], Y[members, 1], Y[members, 2], cmap='viridis', linewidth=1)
plt.gca().legend(('class 1', 'class 2', 'class 3'))
# --- Supervised LDA on the PCA projection ---------------------------------
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(dataset_projected, labels)
dataset_lda = lda.transform(dataset_projected)

colors = ['navy', 'turquoise', 'darkorange']
classes = ['landscapes', 'male', 'female']

# Scatter the two LDA coordinates, one color per true class.
plt.figure(figsize=(10, 8))
for color, class_id, class_name in zip(colors, np.arange(0, 3), classes):
    mask = labels == class_id
    plt.scatter(dataset_lda[mask, 0], dataset_lda[mask, 1], alpha=.8, color=color,
                label=class_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('LDA Eigenvector 1')
plt.ylabel('LDA Eigenvector 2')
plt.title('LDA of EigenFaces distribution')

lda.explained_variance_ratio_
lda.predict(dataset_projected)
def show_lda_extremes(component, kclass, class_name, ascending=True):
    """Show the 12 images with the most extreme LDA scores for one class.

    Replaces three copy-pasted sections (per class, differing only in the
    LDA axis, class id, and sort direction) with one parameterized helper.

    component  -- LDA axis to rank by (0 or 1)
    kclass     -- predicted class id to filter on
    class_name -- label used in plot titles, e.g. "Class1"
    ascending  -- sort direction; False ranks by largest score first
    """
    frame = pd.DataFrame({'lda': dataset_lda[:, component],
                          'label': lda.predict(dataset_projected)})
    frame.sort_values('lda', axis=0, ascending=ascending, inplace=True, na_position='first')
    subset = frame.loc[frame[frame.columns[1]] == kclass]
    subset.head()

    cols, rows = 4, 3
    plt.figure(figsize=(24, 16))
    for i in np.arange(rows * cols):
        plt.subplot(rows, cols, i + 1)
        plt.title(class_name + " LDA " + str(subset['lda'][subset.index[i]]), fontsize=13)
        plt.imshow(dataset[subset.index[i]], plt.cm.gray)


show_lda_extremes(0, 0, "Class1", ascending=True)
show_lda_extremes(1, 1, "Class2", ascending=True)
show_lda_extremes(1, 2, "Class3", ascending=False)
# t-SNE of the PCA projection at several perplexities, one panel each.
perplexities = [5, 30, 50, 100]
fig, subplots = plt.subplots(1, 4, figsize=(20, 8))
plt.axis('tight')

landscapes_class = 0
male_class = 1
female_class = 2

for panel, perplexity in enumerate(perplexities):
    ax = subplots[panel]
    tsne = TSNE(n_components=2, init='random',
                random_state=0, perplexity=perplexity)
    dataset_tsne = tsne.fit_transform(dataset_projected)

    ax.set_title("t-SNE Eigenfaces Perplexity=%d" % perplexity)
    # One scatter series per class, reusing the LDA color scheme.
    for class_id in (landscapes_class, male_class, female_class):
        mask = labels == class_id
        ax.scatter(dataset_tsne[mask, 0], dataset_tsne[mask, 1], c=colors[class_id])
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.axis('tight')
# --- Logistic regression on the PCA features ------------------------------
X_train, X_test, y_train, y_test = train_test_split(dataset_projected, labels, test_size=0.3, stratify=labels)
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_test_pred)
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))

# Confusion-matrix heatmap for the held-out set.
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='.0f')
ax.xaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.yaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.invert_yaxis()
# --- LDA used directly as a classifier on the PCA features ----------------
classifier_lda = LinearDiscriminantAnalysis(n_components=2)
classifier_lda.fit(X_train, y_train)
y_test_pred = classifier_lda.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_test_pred)
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))

# Confusion-matrix heatmap for the held-out set.
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='.0f')
ax.xaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.yaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.invert_yaxis()
# --- Logistic regression on the 2-D LDA projection ------------------------
X_train, X_test, y_train, y_test = train_test_split(dataset_lda, labels, test_size=0.3, stratify=labels)
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

accuracy_score(y_true=y_test, y_pred=y_test_pred)
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))

# Confusion-matrix heatmap for the held-out set.
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='.0f')
ax.xaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.yaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.invert_yaxis()
# --- Logistic regression on a 2-D t-SNE embedding -------------------------
# NOTE(review): t-SNE has no out-of-sample transform; train and test rows
# both come from one joint embedding, so this measures separability of the
# embedding rather than a deployable classifier.
tsne = TSNE(n_components=2, init='random',
            random_state=0, perplexity=80)
dataset_tsne = tsne.fit_transform(dataset_projected)
dataset_tsne.shape

X_train, X_test, y_train, y_test = train_test_split(dataset_tsne, labels, test_size=0.3, stratify=labels)
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))

# Confusion-matrix heatmap for the held-out set.
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax = sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='.0f')
# BUG FIX: the x-axis previously listed only ['male', 'female'] — two labels
# for three columns, mislabeling the heatmap. Use the same three classes
# as the y-axis, consistent with every other heatmap in this file.
ax.xaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.yaxis.set_ticklabels(['landscape', 'male', 'female'])
ax.invert_yaxis()
# --- Hierarchical clustering on the LDA projection ------------------------
linkage_matrix = linkage(y=dataset_lda, method='weighted')

plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
# Truncate to the top 3 merge levels so the tree stays readable.
dendrogram(linkage_matrix, p=3, truncate_mode='level')
plt.show()